Geolocation Stats

How many images are geolocated?

# Count images for each value of ScanAvailable and render the result as a table.
geolocations[
  ,
  .(`Image Count` = .N),
  keyby = .(`Image Available Ind` = ScanAvailable)
] %>%
  kable()
Image Available Ind Image Count
FALSE 348779
TRUE 200924

How many flights have any available geolocations?

# Step 1: per flight, flag 1 when at least one image has ScanAvailable set and
#         0 otherwise. The comparison is coerced directly instead of going
#         through ifelse(), which is meant for vectors and returns a logical NA
#         (not a number) when the condition is NA.
# Step 2: count how many flights fall under each flag value.
geolocations[
  , .(Flight_Geolocation_Ind = as.numeric(sum(ScanAvailable) > 0))
  , keyby = .(FlightID)
][
  , .(`Flight Count` = .N)
  , keyby = .(`Flight Geolocation Ind` = Flight_Geolocation_Ind)
] %>%
  kable
Flight Geolocation Ind Flight Count
0 423
1 1288

What portion of each flight’s images are geolocated?

# Per flight, compute the whole-number percentage of images with ScanAvailable
# set. Flights with no such images are coded as -1 so they get their own bar,
# separate from flights whose percentage floors to 0.
(
  geolocations[
    ,
    .(Image_Count = .N, Geolocation_Count = sum(ScanAvailable)),
    keyby = .(FlightID)
  ][
    ,
    .(
      Pct_Geolocated_Images = ifelse(
        Geolocation_Count == 0,
        -1,
        floor(Geolocation_Count / Image_Count * 100)
      )
    )
  ] %>%
    ggplot(aes(x = Pct_Geolocated_Images)) +
    geom_bar() +
    labs(
      x = "Portions of each flight's images that are geolocated",
      y = 'Flight Frequency'
    ) +
    theme_bw()
) %>%
  ggplotly()

Note: Going forward in this EDA, only the flights and images with available geolocations are considered.

# Keep only rows whose ScanAvailable flag is TRUE. This overwrites
# `geolocations`, so every later step sees the filtered table.
geolocations <- geolocations[ScanAvailable == TRUE]

Date Stats

How many distinct dates appear for each flight?

# Count the distinct Date values per flight, then tabulate how many flights
# share each distinct-date count.
geolocations[
  ,
  .(Distinct_Date_Count = data.table::uniqueN(Date)),
  keyby = .(FlightID)
][
  ,
  .(`Flight Frequency` = .N),
  keyby = .(`Distinct Date Count` = Distinct_Date_Count)
] %>%
  kable()
Distinct Date Count Flight Frequency
1 1276
2 7
3 1
6 1
73 1
83 1
114 1

What dates are observed for flights with multiple dates?

# Flights whose images carry more than one distinct Date
# (one row per FlightID, with the number of distinct dates).
flights.multidate <- geolocations[
  ,
  .(dateCount = data.table::uniqueN(Date)),
  keyby = .(FlightID)
][dateCount > 1]

# For every multi-date flight, report its earliest and latest Date and the
# inclusive span between them (max - min + 1), sorted by that span.
flights.multidate[
  geolocations
  , on = .(FlightID)
  # Inner join: drop geolocation rows whose flight is not in flights.multidate.
  # NULL is the documented way to request this (FALSE is not a documented value).
  , nomatch = NULL
][
  , .(
    Date.min = min(Date),
    Date.max = max(Date)
  )
  , keyby = .(FlightID)
][
  , .(
    FlightID, 
    `Date (min)` = Date.min, 
    `Date (max)`  = Date.max,
    `Date Range` = Date.max -  Date.min + 1
  )
][
  order(`Date Range`)
] %>%
  kable
FlightID Date (min) Date (max) Date Range
C_6650 1940-09-26 1940-09-27 2
AMI_VEN_75 1975-12-02 1975-12-09 8
AMI_LA_82 1982-01-23 1982-01-31 9
TA_CF 1966-12-16 1966-12-27 12
AMI_LA_86 1986-03-22 1986-05-04 44
AMI_SBD_85 1985-01-18 1985-03-25 67
AMI_SD_77 1977-01-17 1977-07-01 166
CAS_PLA 1962-01-01 1962-07-28 209
CAS_3390 1972-06-14 1973-01-23 224
NAPP 1987-06-16 1990-09-07 1180
NAPP_2C 1992-08-26 1996-09-30 1497
NAPP_3C 1998-08-03 2003-06-22 1785

Note: Going forward in this EDA, the 12 flights identified above are filtered out, unless otherwise stated.

# Anti-join: keep only the rows whose FlightID does not appear in
# flights.multidate.
geolocations.singledate_flights <- geolocations[
  !flights.multidate,
  on = "FlightID"
]

Metrics by Time

How many images were recorded by year?

# Bar chart of the number of image rows per Year.
(
  geolocations.singledate_flights[
    ,
    .(Image_Count = .N),
    keyby = .(Year)
  ] %>%
    ggplot(aes(x = Year, y = Image_Count)) +
    geom_bar(stat = 'identity') +
    labs(y = 'Image Count') +
    theme_bw()
) %>%
  ggplotly()

How did the average number of images per flight vary by year?

# Step 1: count images per (FlightID, Year).
# Step 2: average those per-flight counts within each Year.
# The result is drawn as one bar per Year.
(geolocations.singledate_flights[
  , .(Image_Count = .N)
  , keyby = .(FlightID, Year)
][
  , .(
    Image_Count.mean = mean(Image_Count)
  )
  , keyby = .(Year)
] %>% 
  ggplot(
    .,
    aes(x = Year, y = Image_Count.mean)
  ) +
  geom_bar(stat = 'identity') +
  # Axis label typo fixed: "per Flights" -> "per Flight".
  ylab('Average Number of Images per Flight') +
  theme_bw()) %>%
  ggplotly

During what quarters were images recorded?

# Number of image rows falling in each quarter of the year, derived from Date.
geolocations.singledate_flights[
  ,
  .(`Quarter Count` = .N),
  keyby = .(Quarter = quarter(Date))
] %>%
  kable()
Quarter Quarter Count
1 71944
2 49476
3 20690
4 28807

Did the proportion of images by quarter change over time?

# Every year from the earliest to the latest observed Year, in steps of one,
# including years with no images.
year.min <- min(geolocations.singledate_flights[['Year']])
year.max <- max(geolocations.singledate_flights[['Year']])
year.levels <- seq(year.min, year.max, by = 1)

# Bars filled to 100% per year, split by the quarter of each image's Date.
# Year and Quarter are factors with explicit levels and both discrete scales
# use drop = FALSE, so levels without any images are kept on the axis/legend.
(
  geolocations.singledate_flights[
    ,
    .(
      Quarter = factor(quarter(Date), levels = 1:4),
      Year = factor(Year, levels = year.levels),
      Date
    )
  ] %>%
    ggplot(aes(x = Year, fill = Quarter)) +
    geom_bar(position = 'fill') +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_discrete(drop = FALSE) +
    scale_x_discrete(drop = FALSE) +
    theme_bw()
) %>%
  ggplotly() %>%
  plotly::layout(
    # Rotate the year labels so they do not overlap.
    xaxis = list(tickangle = 90),
    # Horizontal legend placed below the plot area.
    legend = list(
      orientation = "h",
      xanchor = "center",
      x = 0.575,
      y = -0.2
    )
  )

Flight Metadata

How many images are there per flight?

# Histogram-style table: first count images per flight, then count how many
# flights share each per-flight image count.
geolocations.image_count <- geolocations.singledate_flights[
  ,
  .(Image_Count = .N),
  keyby = .(FlightID)
][
  ,
  .(Frequency = .N),
  keyby = .(Image_Count)
]

# Bar chart of how many flights have each image count, viewed over 0-500.
(geolocations.image_count %>%
  ggplot(
    .,
    aes(x = Image_Count, y = Frequency)
  ) +
  geom_bar(stat = 'identity') +
  xlab('Image Count') +
  # Zoom the view instead of setting scale limits: xlim() turns out-of-range
  # values into NA, so bars beyond (or straddling) the limits are removed with
  # a warning. coord_cartesian() only restricts the visible window.
  coord_cartesian(xlim = c(0, 500)) +
  theme_bw()) %>%
  ggplotly

Image Scale

What are the most frequent image scales?

# Number of image rows per Scale, most frequent scale first.
geolocations.scale_freq <- geolocations.singledate_flights[
  ,
  .(Scale_Count = .N),
  keyby = .(Scale)
][
  order(-Scale_Count)
]

# Plot only scales seen more than 100 times. Scale becomes a factor whose
# levels follow the frequency-sorted table, so bars appear in that order
# rather than in numeric order.
(
  geolocations.scale_freq[
    Scale_Count > 100,
    .(
      Scale = factor(Scale, levels = geolocations.scale_freq[['Scale']]),
      Scale_Count
    )
  ] %>%
    ggplot(aes(x = Scale, y = Scale_Count)) +
    geom_bar(stat = 'identity') +
    labs(y = 'Scale Count') +
    theme_bw()
) %>%
  ggplotly() %>%
  plotly::layout(xaxis = list(tickangle = 90))

What is the most frequent image scale by year?

# Count images per (Scale, Year), rank the scales within each year by
# descending count, and keep the top-ranked scale for every year.
geolocations.singledate_flights[
  ,
  .(Scale_Count = .N),
  keyby = .(Scale, Year)
][
  order(Year, -Scale_Count),
  # Rows arrive sorted by descending count within Year, so rank 1 is the
  # scale with the highest count for that year.
  freq_rank := seq_len(.N),
  by = .(Year)
][
  freq_rank == 1,
  .(Year, `Most Frequent Scale` = Scale)
][
  order(Year)
] %>%
  kable()
Year Most Frequent Scale
1927 18000
1928 18000
1929 14400
1930 24000
1931 12000
1932 14400
1933 14400
1934 13500
1935 14400
1936 24000
1937 20000
1938 20000
1939 20000
1940 20000
1941 24000
1942 20000
1943 20000
1944 10000
1945 14400
1946 20000
1947 24000
1948 20000
1949 20000
1950 20000
1951 20000
1952 20000
1953 20000
1954 20000
1955 14400
1956 20000
1957 20000
1958 20000
1959 20000
1960 14400
1961 20000
1962 20000
1963 20000
1964 12000
1965 12000
1966 12000
1967 20000
1968 12000
1969 12000
1970 20000
1971 12000
1972 7200
1973 12000
1974 24000
1975 12000
1976 24000
1977 24000
1978 40000
1979 36000
1980 24000
1981 12000
1982 42000
1983 36000
1984 31680
1985 31680
1986 36000
1987 24000
1988 36000
1989 24000
1990 36000
1991 36000
1992 40000
1993 34600
1994 24000
1995 12000
1997 24000
1998 42000
1999 10800
2000 10800
2001 12000
2002 15000
2003 30000
2004 21000
2005 12000
2006 3600
2007 24000
2008 24000
2010 12000

Selected Flights

Which flights are between 1952-1965, with at least 500 images, and 1:20,000 in scale?

# Flights from 1952-1965 at 1:20,000 scale with at least 500 images.
# Images are counted per (FlightID, Year, Scale) combination before filtering.
flights.selected <- geolocations.singledate_flights[
    Year %in% 1952:1965
    , .(
      Image_Count = .N
    )
    , keyby = .(FlightID, Year, Scale)
][
    # ">=" (not ">") so the filter matches the stated criterion of
    # "at least 500 images".
    Image_Count >= 500 & Scale == 20000
    , .(
      FlightID,
      Year,
      Scale,
      Image_Count,
      # NOTE(review): despite the name, this is a two-level value (1500 for
      # flights with more than 1500 images, otherwise 500), not
      # min(Image_Count, 1500). Confirm this is the intended definition.
      Capped_Image_Count = ifelse(
          Image_Count > 1500,
          1500,
          500
      )
    )
]

# Show the selected flights, largest image count first.
flights.selected[order(-Image_Count)] %>%
  kable()
FlightID Year Scale Image_Count Capped_Image_Count
AXL_1953B 1952 20000 6445 1500
CAS_FRE 1965 20000 2509 1500
AXN_1953 1953 20000 2497 1500
ABL_1956 1956 20000 2493 1500
AXJ_1952 1952 20000 2357 1500
AXM_1953B 1953 20000 2337 1500
BTM_1954 1954 20000 1597 1500
AXL_1959 1959 20000 1373 500
ABK_1952 1952 20000 1369 500
CAS_SD 1963 20000 1252 500
AXJ_1959 1959 20000 1138 500
ABF_1957 1957 20000 1046 500
PAI_ABC 1952 20000 990 500
CSH_1953 1952 20000 950 500
AXI_1959 1959 20000 944 500
CAS_STAN 1963 20000 912 500
CIV_1956 1956 20000 837 500
ABD_1957 1957 20000 826 500
CAS_SCL 1963 20000 823 500
BTM_1961 1961 20000 769 500
ABB_1957 1957 20000 730 500
CAS_SAC 1961 20000 670 500
ABO_1957 1957 20000 611 500
AXC_1952 1952 20000 592 500
ABE_1957 1957 20000 590 500
ABL_1952 1952 20000 568 500
AXK_1953 1952 20000 539 500
BUT_1958 1958 20000 508 500

Where are the flights between 1952-1965, with at least 500 images, and 1:20,000 in scale?

# Image rows belonging to the selected flights, matched on FlightID only.
geolocations.selected <- geolocations.singledate_flights[
  flights.selected
  , on = .(FlightID)
  # Inner join: NULL is the documented way to drop unmatched rows
  # (FALSE is not a documented value for nomatch).
  , nomatch = NULL
]

# Convert the selected geolocations into the polygon object drawn on the map.
# createFlightSpatialPolygonDataFrame() is defined outside this section.
flightSpatialPolygonDataFrame <-
  createFlightSpatialPolygonDataFrame(geolocations.selected)

# One colour per selected flight: the 11-colour 'Spectral' palette is
# interpolated up to as many colours as there are rows in flights.selected.
flightPalette <- leaflet::colorFactor(
  palette = colorRampPalette(
    RColorBrewer::brewer.pal(11, 'Spectral')
  )(nrow(flights.selected)),
  domain = factor(flights.selected[['FlightID']])
)

# Interactive map of the selected flights on a label-free CartoDB basemap.
# Polygons are coloured via flightPalette applied to the `flight` column and
# labelled with the `flightYear` column of the polygon data.
leaflet(
  data = flightSpatialPolygonDataFrame,
  options = leafletOptions(preferCanvas = TRUE)
) %>%
  addProviderTiles(
    providers$CartoDB.PositronNoLabels,
    group = 'CartoDB.NoLabels',
    options = providerTileOptions(
      updateWhenZooming = FALSE,
      updateWhenIdle = TRUE
    )
  ) %>%
  addPolygons(color = ~flightPalette(flight), label = ~flightYear)